Learning objective: Pre-processing
When you get genomic measurements, especially if you consider getting genomic measurements across multiple samples, they're often incomparable in various different ways. The machine that you use to collect the measurements might vary from day to day, or different reagents might be used, and so these differences translate into differences in the data from sample to sample.
Pre-processing makes those samples comparable before statistical analyses.
One way to reduce dimension is to take the average in the rows and the columns
# color setup
library(devtools)
## Loading required package: usethis
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
# Plotting setup: custom color palette and solid-circle points used throughout.
tropical = c("darkorange", "dodgerblue", "hotpink", "limegreen", "yellow")
palette(tropical)
par(pch=19)
library(preprocessCore) # provides normalize.quantiles() used later
# Load the Montgomery/Pickrell (montpick) ExpressionSet from the recount site.
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
load(file=con)
close(con)
mp = montpick.eset
pdata = pData(mp) # phenotype data (sample-level covariates, e.g. study)
edata = as.data.frame(exprs(mp)) # expression data (genes x samples)
fdata = fData(mp) # feature data (gene-level annotation)
ls()
## [1] "con" "edata" "fdata" "montpick.eset"
## [5] "mp" "pdata" "tropical"
# Data modification
# Keep only highly expressed genes (rows whose mean count exceeds 100).
edata = edata[rowMeans(edata) > 100,] # drops rows where rowMeans <= 100
edata = log2(edata + 1) # log transformation; +1 avoids log2(0), as explained in Module 1
# Center each gene at zero; otherwise the first singular vector is just the mean level.
edata_centered = edata - rowMeans(edata)
svd1 = svd(edata_centered)
names(svd1) # d = singular values, u/v = left/right singular vectors
## [1] "d" "u" "v"
# Plot the singular values, then the fraction of total variance each explains.
plot(svd1$d, col = 2, ylab = "Singular value")
plot(svd1$d ^ 2 / sum(svd1$d ^ 2), col = 2, ylab = "Percent Variance Explained")
# PC comparison
par(mfrow=c(1,2)) # split view plot: two panels side by side
plot(svd1$v[,1],col=2,ylab="1st PC") # first right singular vector, one value per sample
plot(svd1$v[,2],col=2,ylab="2nd PC")
par(mfrow=c(1,1)) # back to a single panel
plot(svd1$v[,1],svd1$v[,2],col=2,ylab="2nd PC",xlab="1st PC")
# Color by study: the first PC separates the two studies (a batch-like effect).
plot(svd1$v[,1],svd1$v[,2],ylab="2nd PC",xlab="1st PC",col=as.numeric(pdata$study))
# Alternatively showing with boxplot
boxplot(svd1$v[,1] ~ pdata$study,border=c(1,2))
# jitter() spreads the points horizontally so overlapping samples stay visible
points(svd1$v[,1] ~ jitter(as.numeric(pdata$study)), col=as.numeric(pdata$study))
# Do principal component analysis and compare with the SVD.
pc1 = prcomp(edata)
# Not identical: prcomp centers the columns, while svd1 used row-centered data.
plot(pc1$rotation[,1],svd1$v[,1])
# Column-center the data the same way prcomp does...
edata_centered2 = t(t(edata) - colMeans(edata))
svd2 = svd(edata_centered2)
# ...then the loadings match exactly.
plot(pc1$rotation[,1],svd2$v[,1],col=2)
# Investigate effect of a single outlying gene on the decomposition.
edata_outlier = edata_centered
edata_outlier[6,] = edata_centered[6,] * 10000 # introduce an artificial outlier in gene 6
svd3 = svd(edata_outlier)
# The first singular vector changes once the outlier dominates the variance.
plot(svd1$v[,1],svd3$v[,1],xlab="Without outlier",ylab="With outlier")
# The new first singular vector tracks the outlying gene almost exactly.
plot(svd3$v[,1],edata_outlier[6,],col=4)
When using this decomposition, make sure you pick the scaling and centering so that all measurements and features are on a common scale.
Pre-processing here means adding up all the reads to get a count for each gene, for each sample.
Quantile normalization (QN) is a technique that takes replicate samples and makes sure that the bulk distributions look alike.
Forces the distribution to be exactly the same as each other.
QN will force distribution to look exactly the same, but sometimes they should not look exactly the same
When QN, it sometimes makes sense to normalize within groups that are similar or should have similar distribution
# NOTE(review): edata was already log2-transformed in an earlier step above;
# applying log2 again double-transforms the values — confirm this is intended.
edata = log2(edata+1)
edata = edata[rowMeans(edata)>3,] # keep genes with mean (transformed) expression > 3
dim(edata)
## [1] 2383 129
# Overlay per-sample density estimates for the first 20 samples.
colramp = colorRampPalette(c(3,"white",2))(20)
plot(density(edata[,1]),col=colramp[1],lwd=3,ylim=c(0,.30))
for(i in 2:20){lines(density(edata[,i]),lwd=3,col=colramp[i])}
Differences may be from technology differences
# Quantile-normalize: forces every sample's distribution to be exactly the same.
norm_edata = normalize.quantiles(as.matrix(edata))
plot(density(norm_edata[,1]),col=colramp[1],lwd=3,ylim=c(0,.30))
for(i in 2:20){lines(density(norm_edata[,i]),lwd=3,col=colramp[i])}
For the most part, the distributions now lie exactly on top of each other. However, QN removed the bulk differences between samples, not the gene-to-gene variability.
# One gene across samples, colored by study: study differences remain after QN.
plot(norm_edata[1,],col=as.numeric(pdata$study))
Can still see the difference between the two studies.
# SVD of the row-centered normalized data: the PCs still separate the two studies.
svd1 = svd(norm_edata - rowMeans(norm_edata))
plot(svd1$v[,1],svd1$v[,2],col=as.numeric(pdata$study))
Even though we have done quantile normalization, we have not removed gene to gene variability.
$ C = b_0 + b_1P $
line does not perfectly describe the data
Another way to do this is to expand the equation
$ C = b_0 + b_1P + e $ # e is everything we did not measure fit by minimizing \(\sum(C-b_0-b_1P)^2\)
Does the fitted line make sense?
One way to do this is by taking residuals - Take the line and calculate the distance between every data point and actual line, and then make a plot
Ideally, you would like to see similar variability, no big outliers. and centered at zero
We can always fit a line, but line does not always make sense
In genomics, you often have either non-continuous outcomes or non-continuous covariates.
Here, we will discuss a continuous outcome with a non-continuous covariate — a categorical covariate or a factor-level covariate.
Many analyses fit the covariate as a factor, with one coefficient for each level.
By changing the covariate definition, we have changed the regression model
Basic thing to keep in mind is how many levels do you want to fit? What makes sense biologically?
In general in genomic study, you may have measured many covariates.
e.g. technological covariates such as batch effect and biological variables such as demographics of samples
These need adjustments for linear regression models
$ Hu_i = b_0 + b_1Y_i + b_2F_i + e_i $
One way to model error is to color the sample according to measurement. Then new regression model can be modeled.
This way you end up with two regression lines that are parallel to each other
\(b_0\) - percent hungry at year zero for males \(b_0+b_2\) - percent hungry at year zero for females \(b_1\) - change in percent hungry (for either males or females in one year) \(e_i\) - everything we did not measure
This is the way that you often fit these regression models. Careful about how you interpret the coefficients once you have fit adjustment variables especially if you are adjusting for many variables.
Expression = Baseline + RM Effect + BY Effect + (RM effect * BY Effect) + Noise
If fitting these more complicated regression models, it is worthwhile to think about exactly what each of the beta coefficients means.
Keep in mind, how many levels do you want to fit and what makes sense biologically
# Plotting setup and libraries for the linear-models section.
tropical = c("darkorange", "dodgerblue", "hotpink", "limegreen", "yellow")
palette(tropical)
par(pch=19)
library(devtools)
library(Biobase)
library(broom) # tidy() for clean model summaries
Load body map database
# Load the Body Map ExpressionSet from the recount site.
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
load(file=con)
close(con)
bm = bodymap.eset
pdata = pData(bm) # phenotype (sample) data: age, gender, tissue.type, ...
edata = as.data.frame(exprs(bm)) # expression matrix (genes x samples)
fdata = fData(bm) # feature (gene) annotation
ls()
## [1] "bm" "bodymap.eset" "colramp" "con"
## [5] "edata" "edata_centered" "edata_centered2" "edata_outlier"
## [9] "fdata" "i" "montpick.eset" "mp"
## [13] "norm_edata" "pc1" "pdata" "svd1"
## [17] "svd2" "svd3" "tropical"
First thing, convert edata into a matrix (easier to deal with values on a numeric scale). You can fit a linear model with the lm command. We can fit the model gene by gene, taking the first gene of the expression data, then relate it using the tilde operator to any variable.
# Fit a linear model: expression of the first gene as a function of age.
edata = as.matrix(edata)
lm1 = lm(edata[1,] ~ pdata$age)
tidy(lm1) # coefficient estimates, standard errors, p-values
plot(pdata$age, edata[1,], col=1)
abline(lm1, col=2, lwd=3) # overlay the fitted regression line
Relationship between gender
# Expression versus gender (a categorical covariate with two levels).
table(pdata$gender)
##
## F M
## 8 8
boxplot(edata[1,] ~ pdata$gender)
# jitter() spreads the points horizontally so overlapping samples stay visible
points(edata[1,] ~ jitter(as.numeric(pdata$gender)), col=as.numeric(pdata$gender))
but how do we quantify?
# 0/1 indicator (dummy) variables for each gender level.
# NOTE(review): dummy_m and dummy_f are illustrative only — lm() builds the
# indicator internally via model.matrix(), so they are not used below.
dummy_m = pdata$gender == "M"
dummy_f = pdata$gender == "F"
lm2 = lm(edata[1,] ~ pdata$gender)
tidy(lm2)
# The design matrix: an intercept column plus an indicator for gender == "M".
mod2 = model.matrix(~pdata$gender)
mod2
## (Intercept) pdata$genderM
## 1 1 0
## 2 1 1
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 1
## 7 1 0
## 8 1 1
## 9 1 1
## 10 1 0
## 14 1 0
## 15 1 1
## 16 1 1
## 17 1 1
## 18 1 0
## 19 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`pdata$gender`
## [1] "contr.treatment"
We can do this with more category
# Counts of samples per tissue type (a factor with many levels).
table(pdata$tissue.type)
##
## adipose adrenal brain breast
## 1 1 1 1
## colon heart kidney liver
## 1 1 1 1
## lung lymphnode mixture ovary
## 1 1 3 1
## prostate skeletal_muscle testes thyroid
## 1 1 1 1
## white_blood_cell
## 1
# Each tissue level corresponds to its own 0/1 indicator variable.
pdata$tissue.type == "adipose"
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
pdata$tissue.type == "adrenal"
## [1] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# One coefficient per tissue level, each relative to the baseline level.
tidy(lm(edata[1,] ~ pdata$tissue.type))
adjust for variables
# Adjust for additional variables: additive model (age + gender).
lm3 = lm(edata[1,] ~ pdata$age + pdata$gender)
tidy(lm3)
# Interaction model: '*' expands to age + gender + age:gender.
lm4 = lm(edata[1,] ~ pdata$age*pdata$gender)
tidy(lm4)
# Refit on gene 6 (note: lm4 is overwritten here).
lm4 = lm(edata[6,] ~ pdata$age)
plot(pdata$age, edata[6,], col=2)
abline(lm4, col=1, lwd=3)
check outlier case
# Check how a single outlying sample affects the fit for gene 6.
# Use the actual sample count instead of the hard-coded magic number 19,
# and seq_len() instead of the 1:n idiom.
n_samp = ncol(edata)
index = seq_len(n_samp)
lm5 = lm(edata[6,] ~ index) # fit with all samples (includes the outlier)
plot(index, edata[6,], col=2)
abline(lm5, col=1, lwd=3)
# Refit without the last (outlying) sample.
lm6 = lm(edata[6,-n_samp] ~ index[-n_samp])
abline(lm6, col=3, lwd=3)
legend(5,1000, c("With outlier", "Without outlier"), col=c(1,3), lwd=3)
par(mfrow=c(1,2)) # compare residual histograms side by side
hist(lm6$residuals, col=2) # without the outlier
hist(lm5$residuals, col=3) # with the outlier
# Log-transform gene 1 and inspect residuals of a fit against the sample index.
gene1 = log2(edata[1,]+1)
lm7 = lm(gene1 ~ index)
hist(lm7$residuals, col=4)
# Many-level factor plus age: a heavily parameterized model.
lm8 = lm(gene1 ~ pdata$tissue.type+pdata$age)
tidy(lm8)
dim(model.matrix(~ pdata$tissue.type+pdata$age)) # 18 coefficients, only 16 samples
## [1] 16 18
You are fitting too many parameters for the number of data points (18 coefficients for only 16 samples).
# Color residuals by tissue type to look for structure the model missed.
colramp = colorRampPalette(1:4)(17) # one color per tissue level (17 levels above)
lm9 = lm(edata[2,] ~ pdata$age)
plot(lm9$residuals, col=colramp[as.numeric(pdata$tissue.type)])
You would like to associate each feature with case control status and you would like to discover those features that are differentially expressed or differentially associated with those different conditions.
For every single row of this matrix, you will fit a regression model that has some B coefficients multiplied by some design matrix, multiplied by some variables, S(Y), that you care about, plus some corresponding error term (E) for just that gene.
There is much more subtle effect. Intensity dependent effects in the measurements from the genomic data or dye effects or probe composition effects since this is microarray, and many other unknown variables needs to be modeled. When you do this, it becomes a slightly more complicated model.
We need to think linear models as one tool can be applied many many times across many different samples.
# Plotting setup and libraries for the many-regressions section.
tropical = c("darkorange", "dodgerblue", "hotpink", "limegreen", "yellow")
palette(tropical)
par(pch=19)
library(devtools)
library(Biobase)
library(limma) # lmFit() fits the same model to every gene at once
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(edge) # build_study()/fit_models() interface
# Load the Bottomly mouse ExpressionSet from the recount site.
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file=con)
close(con)
bot = bottomly.eset
pdata = pData(bot) # phenotype data (strain, lane.number, ...)
edata = as.matrix(exprs(bot)) # expression matrix
fdata = fData(bot) # feature annotation
ls()
## [1] "bm" "bodymap.eset" "bot" "bottomly.eset"
## [5] "colramp" "con" "dummy_f" "dummy_m"
## [9] "edata" "edata_centered" "edata_centered2" "edata_outlier"
## [13] "fdata" "gene1" "i" "index"
## [17] "lm1" "lm2" "lm3" "lm4"
## [21] "lm5" "lm6" "lm7" "lm8"
## [25] "lm9" "mod2" "montpick.eset" "mp"
## [29] "norm_edata" "pc1" "pdata" "svd1"
## [33] "svd2" "svd3" "tropical"
remove lowly expressed gene
# Transform, remove lowly expressed genes, then fit one regression per gene.
edata = log2(as.matrix(edata) + 1)
edata = edata[rowMeans(edata) > 10,]
mod = model.matrix(~pdata$strain) # intercept + strain indicator
# lm.fit is the low-level fitter: one regression per column of t(edata),
# i.e. one per gene, all sharing the same design matrix.
fit = lm.fit(mod,t(edata))
names(fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
fit$coefficients[,1] # coefficients for the first gene
## (Intercept) pdata$strainDBA/2J
## 10.4116634 0.3478919
look at distribution of coefficients
# Distribution of the two fitted coefficients across all genes.
par(mfrow=c(1,2))
hist(fit$coefficients[1,],breaks=100,col=2,xlab="Intercept")
# BUG FIX: row 2 holds the strain coefficient; the label said "Intercept"
# (copy-paste from the line above).
hist(fit$coefficients[2,],breaks=100,col=2,xlab="Strain coefficient")
# Residuals for the first two samples across all genes.
plot(fit$residuals[,1])
plot(fit$residuals[,2])
fit adjusted model
# Fit adjusted model: strain effect adjusted for sequencing lane (a batch variable).
mod_adj = model.matrix(~pdata$strain + as.factor(pdata$lane.number))
fit_adj = lm.fit(mod_adj,t(edata))
# FIX: spell out the full element name; `$coefficient` only worked via
# `$` partial matching, which is a known footgun.
fit_adj$coefficients[,1]
## (Intercept) pdata$strainDBA/2J
## 10.31359781 0.28934825
## as.factor(pdata$lane.number)2 as.factor(pdata$lane.number)3
## 0.05451431 0.02502244
## as.factor(pdata$lane.number)5 as.factor(pdata$lane.number)6
## 0.07200502 0.38038016
## as.factor(pdata$lane.number)7 as.factor(pdata$lane.number)8
## 0.21815863 0.15103858
limma to fit model
# limma fits the same gene-wise models but takes genes-in-rows directly.
fit_limma = lmFit(edata, mod_adj)
names(fit_limma)
## [1] "coefficients" "rank" "assign" "qr"
## [5] "df.residual" "sigma" "cov.coefficients" "stdev.unscaled"
## [9] "pivot" "Amean" "method" "design"
fit_limma$coefficients[1,] # same estimates as lm.fit produced for the first gene
## (Intercept) pdata$strainDBA/2J
## 10.31359781 0.28934825
## as.factor(pdata$lane.number)2 as.factor(pdata$lane.number)3
## 0.05451431 0.02502244
## as.factor(pdata$lane.number)5 as.factor(pdata$lane.number)6
## 0.07200502 0.38038016
## as.factor(pdata$lane.number)7 as.factor(pdata$lane.number)8
## 0.21815863 0.15103858
build_study is useful when you don't have good knowledge of how to construct the model matrix yourself.
# edge builds the full and null model matrices from the grouping variable
# (strain) and the adjustment variable (lane number).
edge_study = build_study(data=edata,grp=pdata$strain, adj.var = as.factor(pdata$lane.number))
fit_edge = fit_models(edge_study)
summary(fit_edge)
##
## deFit Summary
##
## fit.full:
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490
## ENSMUSG00000000131 10.3 10.3 10.4 10.4 10.3 10.3
## ENSMUSG00000000149 10.6 10.6 10.8 10.8 10.7 10.7
## ENSMUSG00000000374 10.7 10.7 10.7 10.7 10.7 10.7
## SRX033483 SRX033476 SRX033478 SRX033479 SRX033472 SRX033473
## ENSMUSG00000000131 10.4 10.7 10.5 10.5 10.6 10.7
## ENSMUSG00000000149 10.7 11.2 10.9 10.9 10.6 10.7
## ENSMUSG00000000374 10.7 11.0 10.8 10.8 10.9 10.9
## SRX033474 SRX033475 SRX033491 SRX033484 SRX033492 SRX033485
## ENSMUSG00000000131 10.6 10.7 10.7 11.0 11.0 10.8
## ENSMUSG00000000149 10.6 10.6 10.6 11.1 11.1 10.9
## ENSMUSG00000000374 10.9 10.9 10.9 11.2 11.2 11.0
## SRX033493 SRX033486 SRX033494
## ENSMUSG00000000131 10.8 10.8 10.8
## ENSMUSG00000000149 10.9 10.8 10.8
## ENSMUSG00000000374 11.0 10.9 10.9
##
## fit.null:
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490
## ENSMUSG00000000131 10.4 10.4 10.5 10.5 10.4 10.4
## ENSMUSG00000000149 10.6 10.6 10.8 10.8 10.7 10.7
## ENSMUSG00000000374 10.8 10.8 10.8 10.8 10.7 10.7
## SRX033483 SRX033476 SRX033478 SRX033479 SRX033472 SRX033473
## ENSMUSG00000000131 10.6 10.9 10.7 10.7 10.4 10.5
## ENSMUSG00000000149 10.6 11.1 10.9 10.8 10.6 10.8
## ENSMUSG00000000374 10.8 11.1 10.9 10.9 10.8 10.8
## SRX033474 SRX033475 SRX033491 SRX033484 SRX033492 SRX033485
## ENSMUSG00000000131 10.4 10.6 10.6 10.9 10.9 10.7
## ENSMUSG00000000149 10.7 10.6 10.6 11.1 11.1 10.9
## ENSMUSG00000000374 10.7 10.8 10.8 11.1 11.1 10.9
## SRX033493 SRX033486 SRX033494
## ENSMUSG00000000131 10.7 10.7 10.7
## ENSMUSG00000000149 10.9 10.8 10.8
## ENSMUSG00000000374 10.9 10.9 10.9
##
## res.full:
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490
## ENSMUSG00000000131 -0.567 0.616 -0.806 0.654 -0.498 0.748
## ENSMUSG00000000149 -0.793 0.602 -0.986 0.551 -0.523 0.636
## ENSMUSG00000000374 -0.491 0.620 -0.697 0.646 -0.427 0.756
## SRX033483 SRX033476 SRX033478 SRX033479 SRX033472 SRX033473
## ENSMUSG00000000131 -0.231 0.0237 0.00934 0.0511 -0.0493 0.1523
## ENSMUSG00000000149 -0.362 0.5347 0.20909 0.1327 0.1911 0.4355
## ENSMUSG00000000374 -0.232 -0.0977 -0.09356 0.0165 -0.1285 0.0506
## SRX033474 SRX033475 SRX033491 SRX033484 SRX033492 SRX033485
## ENSMUSG00000000131 -0.250 -0.345 0.576 -0.408 0.3841 -0.470
## ENSMUSG00000000149 -0.112 -0.347 0.709 -0.516 -0.0182 -0.601
## ENSMUSG00000000374 -0.329 -0.455 0.687 -0.325 0.4227 -0.401
## SRX033493 SRX033486 SRX033494
## ENSMUSG00000000131 0.461 -0.284 0.233
## ENSMUSG00000000149 0.392 -0.615 0.482
## ENSMUSG00000000374 0.495 -0.328 0.312
##
## res.null:
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490
## ENSMUSG00000000131 -0.664 0.520 -0.902 0.557 -0.594 0.651
## ENSMUSG00000000149 -0.764 0.631 -0.957 0.580 -0.494 0.665
## ENSMUSG00000000374 -0.558 0.554 -0.763 0.580 -0.493 0.689
## SRX033483 SRX033476 SRX033478 SRX033479 SRX033472 SRX033473
## ENSMUSG00000000131 -0.424 -0.169 -0.184 -0.142 0.1436 0.345
## ENSMUSG00000000149 -0.303 0.593 0.268 0.191 0.1324 0.377
## ENSMUSG00000000374 -0.365 -0.231 -0.226 -0.116 0.0044 0.183
## SRX033474 SRX033475 SRX033491 SRX033484 SRX033492 SRX033485
## ENSMUSG00000000131 -0.0568 -0.249 0.672 -0.311 0.4805 -0.374
## ENSMUSG00000000149 -0.1709 -0.376 0.680 -0.546 -0.0476 -0.630
## ENSMUSG00000000374 -0.1960 -0.389 0.754 -0.259 0.4892 -0.335
## SRX033493 SRX033486 SRX033494
## ENSMUSG00000000131 0.557 -0.188 0.330
## ENSMUSG00000000149 0.363 -0.645 0.453
## ENSMUSG00000000374 0.561 -0.262 0.378
##
## beta.coef:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## ENSMUSG00000000131 10.3 0.0545 0.0250 0.0720 0.380 0.218 0.1510 0.2893
## ENSMUSG00000000149 10.6 0.1455 0.0784 0.0596 0.531 0.304 0.2083 -0.0881
## ENSMUSG00000000374 10.7 0.0322 -0.0246 -0.0164 0.273 0.121 0.0576 0.1993
##
## stat.type:
## [1] "lrt"
If each biological group is run on its own batch then it's impossible to tell the differences between group biology and batch variable.
If run replicates of the different groups on the different batches, it's possible to distinguish the difference between the batch effects and group effects.
First thing to dealing with these batch effects is a good study design and you get randomization of samples.
People fit regression models to model the effect of batch. This only works if the batch is not strongly correlated with the group variable of interest.
Y= b0 + b1P + b2B + e
Y is genomic measurement P is phenotype B is batch variable
Shrink down the estimate toward their common mean. If you don't know what the batch effects are. This is common in genomics experiment where batch effects could be due to a large number of things.
Data = primary variables + random variation (sampling error or batch effects), so we normally decompose this into random independent variation and dependent variation.
Data = primary variables + dependent variation + independent variation. The dependent variation can further be divided into estimated batch variable.
The idea here is to estimate batch from the data itself and the algorithm is "Surrogate Variable Analysis"
# Plotting setup and libraries for the batch-effects section.
tropical = c("darkorange", "dodgerblue", "hotpink", "limegreen", "yellow")
palette(tropical)
par(pch=19)
library(devtools)
library(Biobase)
library(sva) # ComBat() and sva() for batch adjustment
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(bladderbatch) # bladder cancer data with known batches
library(snpStats) # SNP data used at the end for population structure
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:edge':
##
## kidney
## Loading required package: Matrix
data(bladderdata) # loads bladderEset into the workspace
ls()
## [1] "bladderEset" "bm" "bodymap.eset" "bot"
## [5] "bottomly.eset" "colramp" "con" "dummy_f"
## [9] "dummy_m" "edata" "edata_centered" "edata_centered2"
## [13] "edata_outlier" "edge_study" "fdata" "fit"
## [17] "fit_adj" "fit_edge" "fit_limma" "gene1"
## [21] "i" "index" "lm1" "lm2"
## [25] "lm3" "lm4" "lm5" "lm6"
## [29] "lm7" "lm8" "lm9" "mod"
## [33] "mod_adj" "mod2" "montpick.eset" "mp"
## [37] "norm_edata" "pc1" "pdata" "svd1"
## [41] "svd2" "svd3" "tropical"
pheno = pData(bladderEset) # sample phenotypes, including cancer status and batch
edata = exprs(bladderEset) # expression matrix (probes x samples)
head(edata)
## GSM71019.CEL GSM71020.CEL GSM71021.CEL GSM71022.CEL GSM71023.CEL
## 1007_s_at 10.115170 8.628044 8.779235 9.248569 10.256841
## 1053_at 5.345168 5.063598 5.113116 5.179410 5.181383
## 117_at 6.348024 6.663625 6.465892 6.116422 5.980457
## 121_at 8.901739 9.439977 9.540738 9.254368 8.798086
## 1255_g_at 3.967672 4.466027 4.144885 4.189338 4.078509
## 1294_at 7.775183 7.110154 7.248430 7.017220 7.896419
## GSM71024.CEL GSM71025.CEL GSM71026.CEL GSM71028.CEL GSM71029.CEL
## 1007_s_at 10.023133 9.108034 8.735616 9.803271 10.168602
## 1053_at 5.248418 5.252312 5.220931 5.595771 5.025180
## 117_at 5.796155 6.414849 6.846798 5.841478 6.352257
## 121_at 8.002870 9.093704 9.263386 7.789240 9.834564
## 1255_g_at 3.919740 4.402590 4.173666 3.590649 4.338196
## 1294_at 7.944676 7.469767 7.281925 7.367814 7.825735
## GSM71030.CEL GSM71031.CEL GSM71032.CEL GSM71033.CEL GSM71034.CEL
## 1007_s_at 8.420904 10.194269 10.184201 9.621902 10.330774
## 1053_at 5.909075 5.307699 5.311109 5.542951 5.458996
## 117_at 5.967566 6.171909 6.156392 6.120551 5.571636
## 121_at 7.844720 7.862812 8.387105 8.646292 8.395863
## 1255_g_at 3.952783 3.985398 3.863569 3.923745 3.815636
## 1294_at 7.401377 8.252998 7.564053 7.009706 8.772693
## GSM71035.CEL GSM71036.CEL GSM71037.CEL GSM71038.CEL GSM71039.CEL
## 1007_s_at 8.986572 10.639548 10.054243 9.768019 9.328306
## 1053_at 5.430052 5.587119 5.493452 5.633127 5.159921
## 117_at 6.000733 6.187920 6.114563 7.011835 6.336885
## 121_at 7.813381 8.752571 8.576715 8.254420 7.959826
## 1255_g_at 3.813763 3.928739 3.942883 3.764742 4.097321
## 1294_at 7.728379 7.553263 7.197903 8.149383 7.636171
## GSM71040.CEL GSM71041.CEL GSM71042.CEL GSM71043.CEL GSM71044.CEL
## 1007_s_at 10.196158 10.292591 10.320349 9.323106 9.628180
## 1053_at 6.076057 5.338113 5.187639 5.955838 5.450218
## 117_at 6.092187 6.088709 6.303387 5.888935 8.477193
## 121_at 8.096221 8.158816 8.598261 7.389635 8.508515
## 1255_g_at 3.774978 3.893144 4.065195 3.633987 3.967857
## 1294_at 8.028049 7.684338 8.122264 7.019915 8.083954
## GSM71045.CEL GSM71046.CEL GSM71047.CEL GSM71048.CEL GSM71049.CEL
## 1007_s_at 10.493403 10.840534 9.368271 10.337764 9.803385
## 1053_at 5.366804 5.437124 5.793530 5.247940 5.320422
## 117_at 6.011152 5.903212 6.313240 5.729220 6.161011
## 121_at 8.336089 7.792873 8.990317 8.976696 8.439951
## 1255_g_at 3.836356 3.799026 4.054613 3.844070 4.202248
## 1294_at 7.964711 7.862600 7.303318 8.013374 7.820776
## GSM71050.CEL GSM71051.CEL GSM71052.CEL GSM71053.CEL GSM71054.CEL
## 1007_s_at 10.158010 9.096022 9.287650 9.636696 9.911038
## 1053_at 5.826067 5.265145 5.391201 5.677830 5.373810
## 117_at 5.944079 6.727406 6.860623 5.862206 6.032169
## 121_at 8.259074 8.992336 8.617814 8.373513 8.227620
## 1255_g_at 3.865914 3.897634 4.019904 3.815712 3.841906
## 1294_at 8.499586 7.015544 7.358916 7.885461 7.288989
## GSM71055.CEL GSM71056.CEL GSM71058.CEL GSM71059.CEL GSM71060.CEL
## 1007_s_at 10.505014 10.417704 9.911863 10.545780 10.131537
## 1053_at 5.441140 5.579827 5.395288 5.524846 5.901671
## 117_at 6.033868 6.077683 6.791961 6.157637 6.058080
## 121_at 8.946566 8.505293 8.291846 8.214104 7.917774
## 1255_g_at 4.017547 3.879476 3.890512 3.894170 3.547688
## 1294_at 7.952671 7.490074 7.450992 8.086814 7.614454
## GSM71061.CEL GSM71062.CEL GSM71063.CEL GSM71064.CEL GSM71065.CEL
## 1007_s_at 9.712869 10.401919 8.763484 9.994538 9.790791
## 1053_at 5.841468 5.656442 5.723440 5.727089 5.484076
## 117_at 6.339688 5.648701 5.211550 6.177668 6.398325
## 121_at 7.968398 8.835210 7.469142 8.256623 8.211274
## 1255_g_at 3.831119 3.706997 3.668804 3.823414 4.164475
## 1294_at 7.676996 8.290899 6.749402 8.355787 7.311462
## GSM71066.CEL GSM71067.CEL GSM71068.CEL GSM71069.CEL GSM71070.CEL
## 1007_s_at 10.292308 10.627200 10.582892 10.009028 9.912853
## 1053_at 5.304608 5.491903 5.615926 5.151548 5.237126
## 117_at 5.891567 6.383317 5.913488 5.904794 5.960948
## 121_at 8.532695 8.016517 8.049998 8.407351 8.985741
## 1255_g_at 3.824158 3.783804 3.775194 3.995371 4.322380
## 1294_at 7.876914 8.554634 8.042399 7.680460 7.199866
## GSM71071.CEL GSM71072.CEL GSM71073.CEL GSM71074.CEL GSM71075.CEL
## 1007_s_at 9.096809 9.011927 8.396062 8.903465 9.501538
## 1053_at 5.093278 5.353248 5.214357 5.251383 5.223598
## 117_at 6.394089 6.425034 6.372520 6.095344 5.811968
## 121_at 8.817789 8.866083 8.704385 9.375736 8.580523
## 1255_g_at 4.141255 3.997644 4.219360 4.454771 4.188310
## 1294_at 7.378438 7.354380 7.179849 7.143989 7.136764
## GSM71076.CEL GSM71077.CEL
## 1007_s_at 9.540766 9.039143
## 1053_at 5.191392 5.235880
## 117_at 6.007461 6.314809
## 121_at 8.848099 9.663298
## 1255_g_at 4.284053 4.877523
## 1294_at 7.369991 6.992046
head(pheno) # includes the cancer outcome and the batch variable
If you have batch variable, you can adjust it directly.
# Known batch: adjust for it directly by including it as a covariate.
mod = model.matrix(~as.factor(cancer) + as.factor(batch), data=pheno)
fit = lm.fit(mod, t(edata))
hist(fit$coefficients[2,],col=2,breaks=100) # cancer coefficients across genes
Another approach is to use ComBat, which is a similar approach to direct linear adjustment.
# ComBat: empirical-Bayes batch correction; returns a cleaned expression matrix.
batch = pheno$batch
modcombat = model.matrix(~1,data=pheno) # intercept-only model passed to ComBat
modcancer = model.matrix(~cancer, data=pheno) # outcome model used after correction
combat_edata = ComBat(dat=edata, batch=batch, mod=modcombat, par.prior=TRUE, prior.plots = FALSE)
## Found5batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# Fit the cancer model on the ComBat-adjusted data.
combat_fit = lm.fit(modcancer,t(combat_edata))
# FIX: spell out the full element name; `$coefficient` only worked via
# `$` partial matching, which is a known footgun.
hist(combat_fit$coefficients[2,],col=2,breaks=100)
# Compare direct linear adjustment vs ComBat estimates, gene by gene.
plot(fit$coefficients[2,],combat_fit$coefficients[2,],col=2,
xlab="Linear Model", ylab="Combat", xlim=c(-5,5),ylim=c(-5,5))
abline(c(0,1),col=1,lwd=3) # y = x reference line
If you don't have batch variable. You might want to infer the batch variable with sva package.
# Unknown batch: estimate surrogate variables from the data with sva().
mod = model.matrix(~cancer, data=pheno) # full model: outcome of interest
mod0 = model.matrix(~1,data=pheno) # null model: intercept only
sval = sva(edata,mod,mod0,n.sv=2) # ask for two surrogate variables
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
names(sval)
## [1] "sv" "pprob.gam" "pprob.b" "n.sv"
dim(sval$sv) # estimated covariates (potential batch effects), one row per sample
## [1] 57 2
correlate batch effect with observed batch variants
# How well do the estimated surrogate variables track the recorded batch?
summary(lm(sval$sv ~ pheno$batch))
## Response Y1 :
##
## Call:
## lm(formula = Y1 ~ pheno$batch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26953 -0.11076 0.00787 0.10399 0.19069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.018470 0.038694 -0.477 0.635
## pheno$batch 0.006051 0.011253 0.538 0.593
##
## Residual standard error: 0.1345 on 55 degrees of freedom
## Multiple R-squared: 0.00523, Adjusted R-squared: -0.01286
## F-statistic: 0.2891 on 1 and 55 DF, p-value: 0.5929
##
##
## Response Y2 :
##
## Call:
## lm(formula = Y2 ~ pheno$batch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23973 -0.07467 -0.02157 0.08116 0.25629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.121112 0.034157 3.546 0.000808 ***
## pheno$batch -0.039675 0.009933 -3.994 0.000194 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1187 on 55 degrees of freedom
## Multiple R-squared: 0.2248, Adjusted R-squared: 0.2107
## F-statistic: 15.95 on 1 and 55 DF, p-value: 0.0001945
surrogate variable versus the batch effects in box plot
# Second surrogate variable versus the recorded batch.
boxplot(sval$sv[,2] ~ pheno$batch)
# jitter() spreads the points horizontally so overlapping samples stay visible
points(sval$sv[,2] ~ jitter(as.numeric(pheno$batch)), col=as.numeric(pheno$batch))
what we've done here with the SVA is not necessarily actually cleaning the data set. We've just identified new covariates that we now need to include in our model fit
# SVA does not clean the data; append the surrogate variables as covariates.
modsv = cbind(mod,sval$sv)
fitsv = lm.fit(modsv, t(edata))
compare different model
# Compare cancer-effect estimates from the different adjustment strategies.
par(mfrow=c(1,2))
plot(fitsv$coefficients[2,],combat_fit$coefficients[2,],col=2,
xlab="SVA", ylab="Combat", xlim=c(-5,5), ylim=c(-5,5))
abline(c(0,1),col=1,lwd=3) # y = x reference line
plot(fitsv$coefficients[2,], fit$coefficients[2,],col=2,
xlab="SVA", ylab="Linear Model", xlim=c(-5,5), ylim=c(-5,5))
abline(c(0,1), col=1,lwd=3)
# Population-structure example using SNP data shipped with snpStats.
tropical = c("darkorange", "dodgerblue", "hotpink", "limegreen", "yellow")
palette(tropical)
par(pch=19)
data(for.exercise) # loads snps.10 and subject.support
controls <- rownames(subject.support)[subject.support$cc==0] # controls only (cc == 0)
use <- seq(1,ncol(snps.10), 10) # thin to every 10th SNP for speed
ctl.10 <- snps.10[controls,use]
Calculate principal components.
# Principal components via the genotype cross-product matrix.
xxmat <- xxt(ctl.10, correct.for.missing=FALSE) # X %*% t(X) for the SNP matrix
evv <- eigen(xxmat, symmetric=TRUE)
pcs <- evv$vectors[,1:5] # keep the first five principal components
Look at what population they come from
# Color samples by population stratum: PC1/PC2 separate the populations.
pop <- subject.support[controls,"stratum"]
plot(pcs[,1],pcs[,2],col=as.numeric(pop),
xlab="PC1",ylab="PC2")
legend(0,0.15,legend=levels(pop),pch=19,col=1:2)